Dataset Preprocessing¶

Library loading¶

In [1]:
import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import yaml
import os
import warnings
import socket
import matplotlib as plt


warnings.filterwarnings('ignore')
In [2]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
with open("../data/resources/rcParams.yaml") as f:
    rcParamsDict = yaml.full_load(f)
    for k in rcParamsDict["rcParams"]:
        print("{} {}".format(k,rcParamsDict["rcParams"][k]))
        plt.rcParams[k] = rcParamsDict["rcParams"][k]
    for k1 in set(list(rcParamsDict)).difference(set(["rcParams"])):
        print("{} {}".format(k1,rcParamsDict[k1]))
scanpy==1.8.0 anndata==0.8.0 umap==0.4.6 numpy==1.22.2 scipy==1.6.2 pandas==1.2.3 scikit-learn==0.24.1 statsmodels==0.13.5 python-igraph==0.9.1 louvain==0.7.0 leidenalg==0.8.3
figure.dpi 50
savefig.dpi 500
figure.figsize [10, 10]
axes.facecolor white
dotSize 20
In [3]:
DSname="UpD100_1"
DSDirName="Sample_S20812_258"

Configure paths¶

In [4]:
outdir = "../data/output"
if not os.path.exists(outdir):
   # Create a new directory because it does not exist
   os.makedirs(outdir)
    
if not os.path.exists("../data/output/adatas"):
   # Create a new directory because it does not exist
   os.makedirs("../data/output/adatas")

with open("../data/resources/iPSC_lines_map.yaml", 'r') as f:
    iPSC_lines_map = yaml.load(f, Loader=yaml.FullLoader)["lines"]
results_file = outdir+'/'+DSname+'.h5ad'
In [5]:
scanpyObj = sc.read_10x_mtx('../data/'+DSDirName+'/filtered_feature_bc_matrix/',  # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)                              # write a cache file for faster subsequent reading

scanpyObj.obs['dataset'] = DSname
scanpyObj.obs_names = [i + "_" + j for i, j in zip(scanpyObj.obs_names.tolist(), scanpyObj.obs["dataset"].tolist())]
... reading from cache file cache/..-data-Sample_S20812_258-filtered_feature_bc_matrix-matrix.h5ad
In [6]:
scanpyObj.var_names_make_unique()
scanpyObj.obs_names_make_unique()

Importing cellIDs¶

In [7]:
cellID = pd.read_csv('../data/'+DSDirName+'/aggregatedCall/aggregatedCall.tsv', sep ="\t",index_col = 0)
cellID
cellID.index = [i + "_" + DSname for i in cellID.index.tolist()]
In [8]:
cellID.Consensus.unique()
Out[8]:
array(['809', 'MIFF1', 'KOLF', '3391B', 'doublet', 'LowQuality'],
      dtype=object)
In [9]:
scanpyObj.obs['cellID'] = cellID.loc[scanpyObj.obs_names, 'Consensus']
In [10]:
scanpyObj.obs
Out[10]:
dataset cellID
AAACCTGCACGACGAA-1_UpD100_1 UpD100_1 809
AAACCTGCAGGCGATA-1_UpD100_1 UpD100_1 MIFF1
AAACCTGCATTGGTAC-1_UpD100_1 UpD100_1 MIFF1
AAACCTGGTCACAAGG-1_UpD100_1 UpD100_1 KOLF
AAACCTGGTCCGCTGA-1_UpD100_1 UpD100_1 KOLF
... ... ...
TTTGTCAGTTCACGGC-1_UpD100_1 UpD100_1 MIFF1
TTTGTCAGTTCCGGCA-1_UpD100_1 UpD100_1 MIFF1
TTTGTCATCAGCTGGC-1_UpD100_1 UpD100_1 LowQuality
TTTGTCATCGCAGGCT-1_UpD100_1 UpD100_1 809
TTTGTCATCTGTACGA-1_UpD100_1 UpD100_1 809

4860 rows × 2 columns

Configure colors¶

In [11]:
cellID_colors = {}
cellID_newName_colors = {}
cellID_newNames = {}


for line in iPSC_lines_map.keys():
    cellID_colors[iPSC_lines_map[line]["oldName"]] = iPSC_lines_map[line]["color"]
    cellID_newName_colors[iPSC_lines_map[line]["newName"]] = iPSC_lines_map[line]["color"]
    cellID_newNames[iPSC_lines_map[line]["oldName"]] = iPSC_lines_map[line]["newName"]

scanpyObj.obs["cellID"] = scanpyObj.obs["cellID"].astype("category")
scanpyObj.obs["cellID_newName"] = scanpyObj.obs["cellID"].replace(cellID_newNames, inplace=False).astype("category")
scanpyObj.uns["cellID_colors"] = [cellID_colors[line] for line in scanpyObj.obs["cellID"].cat.categories]
scanpyObj.uns["cellID_newName_colors"] = [cellID_newName_colors[line] for line in scanpyObj.obs["cellID_newName"].cat.categories]
In [12]:
[cellID_colors[line] for line in scanpyObj.obs["cellID"].cat.categories]
Out[12]:
['#DBB807', '#FF0054', '#0FB248', '#a8a5a5', '#7B00FF', '#403b3b']

Preprocessing¶

Show those genes that yield the highest fraction of counts in each single cells, across all cells.

In [13]:
sc.pl.highest_expr_genes(scanpyObj, n_top=20 )
normalizing counts per cell
    finished (0:00:00)

Subsetting according to good barcodes¶

In [14]:
goodBarcodes=pd.read_csv(outdir+'/'+DSname+'_filteredCells.tsv', sep="\t")["BCs"]+"_"+DSname
scanpyObj = scanpyObj[goodBarcodes,:]

QC metrices¶

In [15]:
scanpyObj.var['mt'] = scanpyObj.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(scanpyObj, qc_vars=['mt'], percent_top=None, log1p=True, inplace=True)

scanpyObj.var['ribo'] = scanpyObj.var_names.str.startswith('RP')  # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(scanpyObj, qc_vars=['ribo'], percent_top=None, log1p=True, inplace=True)
In [16]:
scanpyObj
Out[16]:
AnnData object with n_obs × n_vars = 2971 × 33538
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo'
    uns: 'cellID_colors', 'cellID_newName_colors'
In [17]:
scanpyObj.var
Out[17]:
gene_ids feature_types mt n_cells_by_counts mean_counts log1p_mean_counts pct_dropout_by_counts total_counts log1p_total_counts ribo
MIR1302-2HG ENSG00000243485 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
FAM138A ENSG00000237613 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
OR4F5 ENSG00000186092 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AL627309.1 ENSG00000238009 Gene Expression False 35 0.011781 0.011712 98.821945 35.0 3.583519 False
AL627309.3 ENSG00000239945 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
... ... ... ... ... ... ... ... ... ... ...
AC233755.2 ENSG00000277856 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AC233755.1 ENSG00000275063 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AC240274.1 ENSG00000271254 Gene Expression False 332 0.129923 0.122149 88.825311 386.0 5.958425 False
AC213203.1 ENSG00000277475 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
FAM231C ENSG00000268674 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False

33538 rows × 10 columns

In [18]:
scanpyObj.obs
Out[18]:
dataset cellID cellID_newName n_genes_by_counts log1p_n_genes_by_counts total_counts log1p_total_counts total_counts_mt log1p_total_counts_mt pct_counts_mt total_counts_ribo log1p_total_counts_ribo pct_counts_ribo
AAACCTGCACGACGAA-1_UpD100_1 UpD100_1 809 CTL04E 3077 8.032035 13850.0 9.536113 292.0 5.680172 2.108303 6552.0 8.787679 47.306858
AAACCTGCAGGCGATA-1_UpD100_1 UpD100_1 MIFF1 CTL02A 2844 7.953318 6054.0 8.708640 13.0 2.639057 0.214734 316.0 5.758902 5.219689
AAACCTGCATTGGTAC-1_UpD100_1 UpD100_1 MIFF1 CTL02A 2389 7.779049 7503.0 8.923191 168.0 5.129899 2.239104 2720.0 7.908755 36.252167
AAACCTGGTCCGCTGA-1_UpD100_1 UpD100_1 KOLF CTL08A 3115 8.044305 7486.0 8.920923 42.0 3.761200 0.561047 758.0 6.632002 10.125567
AAACCTGGTGCGGTAA-1_UpD100_1 UpD100_1 MIFF1 CTL02A 1722 7.451822 3645.0 8.201385 16.0 2.833213 0.438957 325.0 5.786897 8.916324
... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTTGTCACACAGGCCT-1_UpD100_1 UpD100_1 3391B CTL01 3232 8.081166 7564.0 8.931288 4.0 1.609438 0.052882 418.0 6.037871 5.526176
TTTGTCACAGGCTCAC-1_UpD100_1 UpD100_1 MIFF1 CTL02A 2605 7.865572 4563.0 8.425955 83.0 4.430817 1.818979 187.0 5.236442 4.098181
TTTGTCACATGAGCGA-1_UpD100_1 UpD100_1 809 CTL04E 2115 7.657283 6213.0 8.734560 251.0 5.529429 4.039917 2053.0 7.627544 33.043617
TTTGTCACATGGTCTA-1_UpD100_1 UpD100_1 KOLF CTL08A 930 6.836259 1324.0 7.189168 23.0 3.178054 1.737160 134.0 4.905275 10.120846
TTTGTCAGTTCCGGCA-1_UpD100_1 UpD100_1 MIFF1 CTL02A 4128 8.325791 18370.0 9.818528 326.0 5.789960 1.774633 5564.0 8.624252 30.288515

2971 rows × 13 columns

In [19]:
sc.pl.violin(scanpyObj, ['n_genes_by_counts', 'total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['total_counts_mt','total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [20]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [21]:
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_total_counts_ribo')
In [22]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [23]:
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_ribo')
In [24]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True)
In [25]:
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_ribo')
In [26]:
scanpyObj.write_h5ad(outdir+'/adatas/'+DSname+'_raw.h5ad')
In [27]:
sc.pp.normalize_total(scanpyObj, exclude_highly_expressed=True, max_fraction=.1)
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['MALAT1']
    finished (0:00:00)
In [28]:
sc.pp.log1p(scanpyObj)
In [29]:
scanpyObj.raw = scanpyObj

Subset according to JointHVGs and Filtered Barcodes from previous step¶

In [30]:
HVGs=pd.read_csv(outdir+"/HVG_list_intersection_Curated.txt", sep = "\t")["HVG"]

scanpyObj = scanpyObj[:,HVGs]
scanpyObj.var["highly_variable"] = True
In [31]:
#sc.pp.highly_variable_genes(scanpyObj, min_mean=0.0125, max_mean=5, min_disp=0.5)
In [32]:
#scanpyObj = scanpyObj[:, HVG.tolist()]

#Multiplexing = Multiplexing[:, Multiplexing.var.highly_variable]
In [33]:
#sc.pl.highly_variable_genes(scanpyObj)

Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed. Scale the data to unit variance.

In [34]:
sc.pp.regress_out(scanpyObj, ['total_counts',"pct_counts_mt"])
regressing out ['total_counts', 'pct_counts_mt']
    sparse input is densified and may lead to high memory use
    finished (0:00:09)

Scale each gene to unit variance. Clip values exceeding standard deviation 10.

In [35]:
sc.pp.scale(scanpyObj, max_value=20)
In [36]:
scanpyObj.obs
Out[36]:
dataset cellID cellID_newName n_genes_by_counts log1p_n_genes_by_counts total_counts log1p_total_counts total_counts_mt log1p_total_counts_mt pct_counts_mt total_counts_ribo log1p_total_counts_ribo pct_counts_ribo
AAACCTGCACGACGAA-1_UpD100_1 UpD100_1 809 CTL04E 3077 8.032035 13850.0 9.536113 292.0 5.680172 2.108303 6552.0 8.787679 47.306858
AAACCTGCAGGCGATA-1_UpD100_1 UpD100_1 MIFF1 CTL02A 2844 7.953318 6054.0 8.708640 13.0 2.639057 0.214734 316.0 5.758902 5.219689
AAACCTGCATTGGTAC-1_UpD100_1 UpD100_1 MIFF1 CTL02A 2389 7.779049 7503.0 8.923191 168.0 5.129899 2.239104 2720.0 7.908755 36.252167
AAACCTGGTCCGCTGA-1_UpD100_1 UpD100_1 KOLF CTL08A 3115 8.044305 7486.0 8.920923 42.0 3.761200 0.561047 758.0 6.632002 10.125567
AAACCTGGTGCGGTAA-1_UpD100_1 UpD100_1 MIFF1 CTL02A 1722 7.451822 3645.0 8.201385 16.0 2.833213 0.438957 325.0 5.786897 8.916324
... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTTGTCACACAGGCCT-1_UpD100_1 UpD100_1 3391B CTL01 3232 8.081166 7564.0 8.931288 4.0 1.609438 0.052882 418.0 6.037871 5.526176
TTTGTCACAGGCTCAC-1_UpD100_1 UpD100_1 MIFF1 CTL02A 2605 7.865572 4563.0 8.425955 83.0 4.430817 1.818979 187.0 5.236442 4.098181
TTTGTCACATGAGCGA-1_UpD100_1 UpD100_1 809 CTL04E 2115 7.657283 6213.0 8.734560 251.0 5.529429 4.039917 2053.0 7.627544 33.043617
TTTGTCACATGGTCTA-1_UpD100_1 UpD100_1 KOLF CTL08A 930 6.836259 1324.0 7.189168 23.0 3.178054 1.737160 134.0 4.905275 10.120846
TTTGTCAGTTCCGGCA-1_UpD100_1 UpD100_1 MIFF1 CTL02A 4128 8.325791 18370.0 9.818528 326.0 5.789960 1.774633 5564.0 8.624252 30.288515

2971 rows × 13 columns

Principal component analysis¶

In [37]:
sc.tl.pca(scanpyObj, svd_solver='arpack')
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
In [38]:
sc.pl.pca(scanpyObj, color='MKI67')
In [39]:
sc.pl.pca_variance_ratio(scanpyObj, log=True)
In [40]:
scanpyObj
Out[40]:
AnnData object with n_obs × n_vars = 2971 × 3499
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo', 'highly_variable', 'mean', 'std'
    uns: 'cellID_colors', 'cellID_newName_colors', 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

Computing the neighborhood graph¶

In [41]:
sc.pp.neighbors(scanpyObj, n_neighbors=10, n_pcs=9)
computing neighbors
    using 'X_pca' with n_pcs = 9
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)

Embedding the neighborhood graph¶

In [42]:
sc.tl.umap(scanpyObj)
scanpyObj
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:03)
Out[42]:
AnnData object with n_obs × n_vars = 2971 × 3499
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo', 'highly_variable', 'mean', 'std'
    uns: 'cellID_colors', 'cellID_newName_colors', 'log1p', 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [43]:
sc.pl.umap(scanpyObj, color=['MKI67', 'S100B',"NES","PAX6","DCX",'STMN2',"DCN", "BGN"])
In [44]:
sc.tl.diffmap(scanpyObj)
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
    finished (0:00:00)
    eigenvalues of transition matrix
    [1.         0.9944271  0.99305123 0.99133533 0.9903787  0.9883785
     0.9853552  0.97960895 0.97437984 0.97234106 0.9711082  0.9658007
     0.9601472  0.9560657  0.95214266]
    finished: added
    'X_diffmap', diffmap coordinates (adata.obsm)
    'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:00)
In [45]:
sc.tl.dpt(scanpyObj)
WARNING: No root cell found. To compute pseudotime, pass the index or expression vector of a root cell, one of:
    adata.uns['iroot'] = root_cell_index
    adata.var['xroot'] = adata[root_cell_name, :].X
computing Diffusion Pseudotime using n_dcs=10
    finished: added
 (0:00:00)
In [46]:
sc.pl.diffmap(scanpyObj, color=[ 'MKI67', 'S100B',"NES","PAX6","DCX",'STMN2',"DCN", "BGN"])
In [47]:
scanpyObj.X.max()
Out[47]:
20.0